importing libraries¶

In [2]:
# Importing essential libraries
import numpy as np
import pandas as pd

# Visualization tools for exploratory data analysis (EDA)
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px

# Evaluation metric libraries for model performance
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score, recall_score, 
    f1_score, classification_report, mean_absolute_error,  accuracy_score,
    mean_squared_error, r2_score
)

# Preprocessing tools
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Libraries for splitting the dataset and hyperparameter tuning
from sklearn.model_selection import (
    train_test_split, GridSearchCV, RandomizedSearchCV, 
    RepeatedStratifiedKFold
)
from sklearn.model_selection import cross_val_score

# Machine Learning models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPClassifier 
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb  # Xtreme Gradient Boosting (XGBoost)

# Ignoring warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Inline plotting for Jupyter Notebooks
%matplotlib inline
In [3]:
car=pd.read_csv(r"C:\Users\lakshita\Desktop\datasets\MLmodels\car_prediction\car_price_prediction.csv")
In [4]:
car.head
Out[4]:
<bound method NDFrame.head of      car_ID  symboling                   CarName fueltype aspiration  \
0         1          3        alfa-romero giulia      gas        std   
1         2          3       alfa-romero stelvio      gas        std   
2         3          1  alfa-romero Quadrifoglio      gas        std   
3         4          2               audi 100 ls      gas        std   
4         5          2                audi 100ls      gas        std   
..      ...        ...                       ...      ...        ...   
200     201         -1           volvo 145e (sw)      gas        std   
201     202         -1               volvo 144ea      gas      turbo   
202     203         -1               volvo 244dl      gas        std   
203     204         -1                 volvo 246   diesel      turbo   
204     205         -1               volvo 264gl      gas      turbo   

    doornumber      carbody drivewheel enginelocation  wheelbase  ...  \
0          two  convertible        rwd          front       88.6  ...   
1          two  convertible        rwd          front       88.6  ...   
2          two    hatchback        rwd          front       94.5  ...   
3         four        sedan        fwd          front       99.8  ...   
4         four        sedan        4wd          front       99.4  ...   
..         ...          ...        ...            ...        ...  ...   
200       four        sedan        rwd          front      109.1  ...   
201       four        sedan        rwd          front      109.1  ...   
202       four        sedan        rwd          front      109.1  ...   
203       four        sedan        rwd          front      109.1  ...   
204       four        sedan        rwd          front      109.1  ...   

     enginesize  fuelsystem  boreratio  stroke compressionratio horsepower  \
0           130        mpfi       3.47    2.68              9.0        111   
1           130        mpfi       3.47    2.68              9.0        111   
2           152        mpfi       2.68    3.47              9.0        154   
3           109        mpfi       3.19    3.40             10.0        102   
4           136        mpfi       3.19    3.40              8.0        115   
..          ...         ...        ...     ...              ...        ...   
200         141        mpfi       3.78    3.15              9.5        114   
201         141        mpfi       3.78    3.15              8.7        160   
202         173        mpfi       3.58    2.87              8.8        134   
203         145         idi       3.01    3.40             23.0        106   
204         141        mpfi       3.78    3.15              9.5        114   

     peakrpm citympg  highwaympg    price  
0       5000      21          27  13495.0  
1       5000      21          27  16500.0  
2       5000      19          26  16500.0  
3       5500      24          30  13950.0  
4       5500      18          22  17450.0  
..       ...     ...         ...      ...  
200     5400      23          28  16845.0  
201     5300      19          25  19045.0  
202     5500      18          23  21485.0  
203     4800      26          27  22470.0  
204     5400      19          25  22625.0  

[205 rows x 26 columns]>
In [5]:
car.describe(include='all')
Out[5]:
car_ID symboling CarName fueltype aspiration doornumber carbody drivewheel enginelocation wheelbase ... enginesize fuelsystem boreratio stroke compressionratio horsepower peakrpm citympg highwaympg price
count 205.000000 205.000000 205 205 205 205 205 205 205 205.000000 ... 205.000000 205 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000
unique NaN NaN 147 2 2 2 5 3 2 NaN ... NaN 8 NaN NaN NaN NaN NaN NaN NaN NaN
top NaN NaN toyota corona gas std four sedan fwd front NaN ... NaN mpfi NaN NaN NaN NaN NaN NaN NaN NaN
freq NaN NaN 6 185 168 115 96 120 202 NaN ... NaN 94 NaN NaN NaN NaN NaN NaN NaN NaN
mean 103.000000 0.834146 NaN NaN NaN NaN NaN NaN NaN 98.756585 ... 126.907317 NaN 3.329756 3.255415 10.142537 104.117073 5125.121951 25.219512 30.751220 13276.710571
std 59.322565 1.245307 NaN NaN NaN NaN NaN NaN NaN 6.021776 ... 41.642693 NaN 0.270844 0.313597 3.972040 39.544167 476.985643 6.542142 6.886443 7988.852332
min 1.000000 -2.000000 NaN NaN NaN NaN NaN NaN NaN 86.600000 ... 61.000000 NaN 2.540000 2.070000 7.000000 48.000000 4150.000000 13.000000 16.000000 5118.000000
25% 52.000000 0.000000 NaN NaN NaN NaN NaN NaN NaN 94.500000 ... 97.000000 NaN 3.150000 3.110000 8.600000 70.000000 4800.000000 19.000000 25.000000 7788.000000
50% 103.000000 1.000000 NaN NaN NaN NaN NaN NaN NaN 97.000000 ... 120.000000 NaN 3.310000 3.290000 9.000000 95.000000 5200.000000 24.000000 30.000000 10295.000000
75% 154.000000 2.000000 NaN NaN NaN NaN NaN NaN NaN 102.400000 ... 141.000000 NaN 3.580000 3.410000 9.400000 116.000000 5500.000000 30.000000 34.000000 16503.000000
max 205.000000 3.000000 NaN NaN NaN NaN NaN NaN NaN 120.900000 ... 326.000000 NaN 3.940000 4.170000 23.000000 288.000000 6600.000000 49.000000 54.000000 45400.000000

11 rows × 26 columns

In [6]:
car.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 17  fuelsystem        205 non-null    object 
 18  boreratio         205 non-null    float64
 19  stroke            205 non-null    float64
 20  compressionratio  205 non-null    float64
 21  horsepower        205 non-null    int64  
 22  peakrpm           205 non-null    int64  
 23  citympg           205 non-null    int64  
 24  highwaympg        205 non-null    int64  
 25  price             205 non-null    float64
dtypes: float64(8), int64(8), object(10)
memory usage: 41.8+ KB
In [7]:
print(car.dtypes)
car_ID                int64
symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object
In [8]:
car.value_counts
Out[8]:
<bound method DataFrame.value_counts of      car_ID  symboling                   CarName fueltype aspiration  \
0         1          3        alfa-romero giulia      gas        std   
1         2          3       alfa-romero stelvio      gas        std   
2         3          1  alfa-romero Quadrifoglio      gas        std   
3         4          2               audi 100 ls      gas        std   
4         5          2                audi 100ls      gas        std   
..      ...        ...                       ...      ...        ...   
200     201         -1           volvo 145e (sw)      gas        std   
201     202         -1               volvo 144ea      gas      turbo   
202     203         -1               volvo 244dl      gas        std   
203     204         -1                 volvo 246   diesel      turbo   
204     205         -1               volvo 264gl      gas      turbo   

    doornumber      carbody drivewheel enginelocation  wheelbase  ...  \
0          two  convertible        rwd          front       88.6  ...   
1          two  convertible        rwd          front       88.6  ...   
2          two    hatchback        rwd          front       94.5  ...   
3         four        sedan        fwd          front       99.8  ...   
4         four        sedan        4wd          front       99.4  ...   
..         ...          ...        ...            ...        ...  ...   
200       four        sedan        rwd          front      109.1  ...   
201       four        sedan        rwd          front      109.1  ...   
202       four        sedan        rwd          front      109.1  ...   
203       four        sedan        rwd          front      109.1  ...   
204       four        sedan        rwd          front      109.1  ...   

     enginesize  fuelsystem  boreratio  stroke compressionratio horsepower  \
0           130        mpfi       3.47    2.68              9.0        111   
1           130        mpfi       3.47    2.68              9.0        111   
2           152        mpfi       2.68    3.47              9.0        154   
3           109        mpfi       3.19    3.40             10.0        102   
4           136        mpfi       3.19    3.40              8.0        115   
..          ...         ...        ...     ...              ...        ...   
200         141        mpfi       3.78    3.15              9.5        114   
201         141        mpfi       3.78    3.15              8.7        160   
202         173        mpfi       3.58    2.87              8.8        134   
203         145         idi       3.01    3.40             23.0        106   
204         141        mpfi       3.78    3.15              9.5        114   

     peakrpm citympg  highwaympg    price  
0       5000      21          27  13495.0  
1       5000      21          27  16500.0  
2       5000      19          26  16500.0  
3       5500      24          30  13950.0  
4       5500      18          22  17450.0  
..       ...     ...         ...      ...  
200     5400      23          28  16845.0  
201     5300      19          25  19045.0  
202     5500      18          23  21485.0  
203     4800      26          27  22470.0  
204     5400      19          25  22625.0  

[205 rows x 26 columns]>
In [9]:
car.sort_values
Out[9]:
<bound method DataFrame.sort_values of      car_ID  symboling                   CarName fueltype aspiration  \
0         1          3        alfa-romero giulia      gas        std   
1         2          3       alfa-romero stelvio      gas        std   
2         3          1  alfa-romero Quadrifoglio      gas        std   
3         4          2               audi 100 ls      gas        std   
4         5          2                audi 100ls      gas        std   
..      ...        ...                       ...      ...        ...   
200     201         -1           volvo 145e (sw)      gas        std   
201     202         -1               volvo 144ea      gas      turbo   
202     203         -1               volvo 244dl      gas        std   
203     204         -1                 volvo 246   diesel      turbo   
204     205         -1               volvo 264gl      gas      turbo   

    doornumber      carbody drivewheel enginelocation  wheelbase  ...  \
0          two  convertible        rwd          front       88.6  ...   
1          two  convertible        rwd          front       88.6  ...   
2          two    hatchback        rwd          front       94.5  ...   
3         four        sedan        fwd          front       99.8  ...   
4         four        sedan        4wd          front       99.4  ...   
..         ...          ...        ...            ...        ...  ...   
200       four        sedan        rwd          front      109.1  ...   
201       four        sedan        rwd          front      109.1  ...   
202       four        sedan        rwd          front      109.1  ...   
203       four        sedan        rwd          front      109.1  ...   
204       four        sedan        rwd          front      109.1  ...   

     enginesize  fuelsystem  boreratio  stroke compressionratio horsepower  \
0           130        mpfi       3.47    2.68              9.0        111   
1           130        mpfi       3.47    2.68              9.0        111   
2           152        mpfi       2.68    3.47              9.0        154   
3           109        mpfi       3.19    3.40             10.0        102   
4           136        mpfi       3.19    3.40              8.0        115   
..          ...         ...        ...     ...              ...        ...   
200         141        mpfi       3.78    3.15              9.5        114   
201         141        mpfi       3.78    3.15              8.7        160   
202         173        mpfi       3.58    2.87              8.8        134   
203         145         idi       3.01    3.40             23.0        106   
204         141        mpfi       3.78    3.15              9.5        114   

     peakrpm citympg  highwaympg    price  
0       5000      21          27  13495.0  
1       5000      21          27  16500.0  
2       5000      19          26  16500.0  
3       5500      24          30  13950.0  
4       5500      18          22  17450.0  
..       ...     ...         ...      ...  
200     5400      23          28  16845.0  
201     5300      19          25  19045.0  
202     5500      18          23  21485.0  
203     4800      26          27  22470.0  
204     5400      19          25  22625.0  

[205 rows x 26 columns]>
In [10]:
car.isnull()
Out[10]:
car_ID symboling CarName fueltype aspiration doornumber carbody drivewheel enginelocation wheelbase ... enginesize fuelsystem boreratio stroke compressionratio horsepower peakrpm citympg highwaympg price
0 False False False False False False False False False False ... False False False False False False False False False False
1 False False False False False False False False False False ... False False False False False False False False False False
2 False False False False False False False False False False ... False False False False False False False False False False
3 False False False False False False False False False False ... False False False False False False False False False False
4 False False False False False False False False False False ... False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 False False False False False False False False False False ... False False False False False False False False False False
201 False False False False False False False False False False ... False False False False False False False False False False
202 False False False False False False False False False False ... False False False False False False False False False False
203 False False False False False False False False False False ... False False False False False False False False False False
204 False False False False False False False False False False ... False False False False False False False False False False

205 rows × 26 columns

In [11]:
car.nunique()
Out[11]:
car_ID              205
symboling             6
CarName             147
fueltype              2
aspiration            2
doornumber            2
carbody               5
drivewheel            3
enginelocation        2
wheelbase            53
carlength            75
carwidth             44
carheight            49
curbweight          171
enginetype            7
cylindernumber        7
enginesize           44
fuelsystem            8
boreratio            38
stroke               37
compressionratio     32
horsepower           59
peakrpm              23
citympg              29
highwaympg           30
price               189
dtype: int64
In [12]:
print(car['CarName'].unique())
['alfa-romero giulia' 'alfa-romero stelvio' 'alfa-romero Quadrifoglio'
 'audi 100 ls' 'audi 100ls' 'audi fox' 'audi 5000' 'audi 4000'
 'audi 5000s (diesel)' 'bmw 320i' 'bmw x1' 'bmw x3' 'bmw z4' 'bmw x4'
 'bmw x5' 'chevrolet impala' 'chevrolet monte carlo' 'chevrolet vega 2300'
 'dodge rampage' 'dodge challenger se' 'dodge d200' 'dodge monaco (sw)'
 'dodge colt hardtop' 'dodge colt (sw)' 'dodge coronet custom'
 'dodge dart custom' 'dodge coronet custom (sw)' 'honda civic'
 'honda civic cvcc' 'honda accord cvcc' 'honda accord lx'
 'honda civic 1500 gl' 'honda accord' 'honda civic 1300' 'honda prelude'
 'honda civic (auto)' 'isuzu MU-X' 'isuzu D-Max ' 'isuzu D-Max V-Cross'
 'jaguar xj' 'jaguar xf' 'jaguar xk' 'maxda rx3' 'maxda glc deluxe'
 'mazda rx2 coupe' 'mazda rx-4' 'mazda glc deluxe' 'mazda 626' 'mazda glc'
 'mazda rx-7 gs' 'mazda glc 4' 'mazda glc custom l' 'mazda glc custom'
 'buick electra 225 custom' 'buick century luxus (sw)' 'buick century'
 'buick skyhawk' 'buick opel isuzu deluxe' 'buick skylark'
 'buick century special' 'buick regal sport coupe (turbo)'
 'mercury cougar' 'mitsubishi mirage' 'mitsubishi lancer'
 'mitsubishi outlander' 'mitsubishi g4' 'mitsubishi mirage g4'
 'mitsubishi montero' 'mitsubishi pajero' 'Nissan versa' 'nissan gt-r'
 'nissan rogue' 'nissan latio' 'nissan titan' 'nissan leaf' 'nissan juke'
 'nissan note' 'nissan clipper' 'nissan nv200' 'nissan dayz' 'nissan fuga'
 'nissan otti' 'nissan teana' 'nissan kicks' 'peugeot 504' 'peugeot 304'
 'peugeot 504 (sw)' 'peugeot 604sl' 'peugeot 505s turbo diesel'
 'plymouth fury iii' 'plymouth cricket' 'plymouth satellite custom (sw)'
 'plymouth fury gran sedan' 'plymouth valiant' 'plymouth duster'
 'porsche macan' 'porcshce panamera' 'porsche cayenne' 'porsche boxter'
 'renault 12tl' 'renault 5 gtl' 'saab 99e' 'saab 99le' 'saab 99gle'
 'subaru' 'subaru dl' 'subaru brz' 'subaru baja' 'subaru r1' 'subaru r2'
 'subaru trezia' 'subaru tribeca' 'toyota corona mark ii' 'toyota corona'
 'toyota corolla 1200' 'toyota corona hardtop' 'toyota corolla 1600 (sw)'
 'toyota carina' 'toyota mark ii' 'toyota corolla'
 'toyota corolla liftback' 'toyota celica gt liftback'
 'toyota corolla tercel' 'toyota corona liftback' 'toyota starlet'
 'toyota tercel' 'toyota cressida' 'toyota celica gt' 'toyouta tercel'
 'vokswagen rabbit' 'volkswagen 1131 deluxe sedan' 'volkswagen model 111'
 'volkswagen type 3' 'volkswagen 411 (sw)' 'volkswagen super beetle'
 'volkswagen dasher' 'vw dasher' 'vw rabbit' 'volkswagen rabbit'
 'volkswagen rabbit custom' 'volvo 145e (sw)' 'volvo 144ea' 'volvo 244dl'
 'volvo 245' 'volvo 264gl' 'volvo diesel' 'volvo 246']
In [13]:
duplicated_rows = car[car.duplicated()]
print(f"duplicated values are : {duplicated_rows.shape[0]}")
print(car.isnull().sum())
duplicated values are : 0
car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64
In [14]:
sns.pairplot(car)
Out[14]:
<seaborn.axisgrid.PairGrid at 0x284b6e77680>
No description has been provided for this image

Graph b/w Carname and price¶

In [16]:
print(car.head())
plt.figure(figsize=(10,30)) 
sns.scatterplot(x='price',y='CarName', data=car, color='grey') 
plt.title('Scatterplot of CarName V/S Price')
plt.xlabel('Price')
plt.ylabel('CarName')
plt.show()
   car_ID  symboling                   CarName fueltype aspiration doornumber  \
0       1          3        alfa-romero giulia      gas        std        two   
1       2          3       alfa-romero stelvio      gas        std        two   
2       3          1  alfa-romero Quadrifoglio      gas        std        two   
3       4          2               audi 100 ls      gas        std       four   
4       5          2                audi 100ls      gas        std       four   

       carbody drivewheel enginelocation  wheelbase  ...  enginesize  \
0  convertible        rwd          front       88.6  ...         130   
1  convertible        rwd          front       88.6  ...         130   
2    hatchback        rwd          front       94.5  ...         152   
3        sedan        fwd          front       99.8  ...         109   
4        sedan        4wd          front       99.4  ...         136   

   fuelsystem  boreratio  stroke compressionratio horsepower  peakrpm citympg  \
0        mpfi       3.47    2.68              9.0        111     5000      21   
1        mpfi       3.47    2.68              9.0        111     5000      21   
2        mpfi       2.68    3.47              9.0        154     5000      19   
3        mpfi       3.19    3.40             10.0        102     5500      24   
4        mpfi       3.19    3.40              8.0        115     5500      18   

   highwaympg    price  
0          27  13495.0  
1          27  16500.0  
2          26  16500.0  
3          30  13950.0  
4          22  17450.0  

[5 rows x 26 columns]
No description has been provided for this image
In [17]:
target_variable='price'
In [18]:
px.box(car.carlength,car.carwidth)
In [19]:
plt.figure(figsize=(15, 8))  # Set the size of the plot
car_counts = car['CarName'].value_counts().head(20)  # Show top 20 car names
sns.barplot(x=car_counts.index, y=car_counts.values, palette='Oranges')
plt.title('Top 20 Cars by Sales')
plt.xlabel('Car Name')
plt.ylabel('Number of Sales')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.show()
No description has been provided for this image
In [20]:
sns.histplot(car['price'], kde=True, bins=30, color='violet')
plt.title('Distribution of Car Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
No description has been provided for this image
In [21]:
sns.scatterplot(data=car, x='enginesize', y='price', hue='fueltype')
plt.title('Engine Size vs Price')
plt.xlabel('Engine Size')
plt.ylabel('Price')
plt.show()
No description has been provided for this image
In [22]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=car, x='carbody', y='price', palette='Set2')
plt.title('Price by Car Body Type')
plt.xlabel('Car Body Type')
plt.ylabel('Price')
plt.show()
No description has been provided for this image
In [23]:
sns.pairplot(car[['price', 'enginesize', 'horsepower', 'curbweight']])
plt.suptitle('Pair Plot of Key Features', y=1.02)
plt.show()
No description has been provided for this image
In [24]:
plt.figure(figsize=(19, 8))
sns.lineplot(data=car, x='horsepower', y='price', marker='o')
plt.title('Horsepower vs Price')
plt.xlabel('Horsepower')
plt.ylabel('Price')
plt.show()
No description has been provided for this image
In [25]:
sns.countplot(data=car, x='fueltype', palette='muted')
plt.title('Count of Cars by Fuel Type')
plt.xlabel('Fuel Type')
plt.ylabel('Count')
plt.show()
No description has been provided for this image
In [26]:
fig = px.scatter(car,x='enginesize',y='price',color='fueltype',title='Engine Size vs Price',labels={'enginesize': 'Engine Size', 'price': 'Price'})
fig.update_layout(title_font_size=18,xaxis_title='Engine Size',yaxis_title='Price',height=600,width=800)
fig.show()
In [27]:
plt.figure(figsize=(8, 6))
sns.scatterplot(x=car['highwaympg'], y=car['price'], color='blue', alpha=0.7)
plt.title('Highway MPG vs Price', fontsize=16)
plt.xlabel('Highway MPG', fontsize=12)
plt.ylabel('Price', fontsize=12)
plt.grid(True)
plt.show()
No description has been provided for this image
In [28]:
correlation = car['highwaympg'].corr(car['price'])
print(f"Correlation between highwaympg and price: {correlation}")
Correlation between highwaympg and price: -0.6975990916465566

Exclude Non-Numeric Columns¶

In [30]:
print(car.columns)
Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')
In [31]:
car = car.drop(['CarName', 'car_ID'], axis=1)
In [32]:
# Select only numeric columns
numeric_data = car.select_dtypes(include=['float64', 'int64'])
# Compute the correlation matrix
correlation_matrix = numeric_data.corr()
# Display the correlation matrix
print(correlation_matrix)
                  symboling  wheelbase  carlength  carwidth  carheight  \
symboling          1.000000  -0.531954  -0.357612 -0.232919  -0.541038   
wheelbase         -0.531954   1.000000   0.874587  0.795144   0.589435   
carlength         -0.357612   0.874587   1.000000  0.841118   0.491029   
carwidth          -0.232919   0.795144   0.841118  1.000000   0.279210   
carheight         -0.541038   0.589435   0.491029  0.279210   1.000000   
curbweight        -0.227691   0.776386   0.877728  0.867032   0.295572   
enginesize        -0.105790   0.569329   0.683360  0.735433   0.067149   
boreratio         -0.130051   0.488750   0.606454  0.559150   0.171071   
stroke            -0.008735   0.160959   0.129533  0.182942  -0.055307   
compressionratio  -0.178515   0.249786   0.158414  0.181129   0.261214   
horsepower         0.070873   0.353294   0.552623  0.640732  -0.108802   
peakrpm            0.273606  -0.360469  -0.287242 -0.220012  -0.320411   
citympg           -0.035823  -0.470414  -0.670909 -0.642704  -0.048640   
highwaympg         0.034606  -0.544082  -0.704662 -0.677218  -0.107358   
price             -0.079978   0.577816   0.682920  0.759325   0.119336   

                  curbweight  enginesize  boreratio    stroke  \
symboling          -0.227691   -0.105790  -0.130051 -0.008735   
wheelbase           0.776386    0.569329   0.488750  0.160959   
carlength           0.877728    0.683360   0.606454  0.129533   
carwidth            0.867032    0.735433   0.559150  0.182942   
carheight           0.295572    0.067149   0.171071 -0.055307   
curbweight          1.000000    0.850594   0.648480  0.168790   
enginesize          0.850594    1.000000   0.583774  0.203129   
boreratio           0.648480    0.583774   1.000000 -0.055909   
stroke              0.168790    0.203129  -0.055909  1.000000   
compressionratio    0.151362    0.028971   0.005197  0.186110   
horsepower          0.750739    0.809769   0.573677  0.080940   
peakrpm            -0.266243   -0.244660  -0.254976 -0.067964   
citympg            -0.757414   -0.653658  -0.584532 -0.042145   
highwaympg         -0.797465   -0.677470  -0.587012 -0.043931   
price               0.835305    0.874145   0.553173  0.079443   

                  compressionratio  horsepower   peakrpm   citympg  \
symboling                -0.178515    0.070873  0.273606 -0.035823   
wheelbase                 0.249786    0.353294 -0.360469 -0.470414   
carlength                 0.158414    0.552623 -0.287242 -0.670909   
carwidth                  0.181129    0.640732 -0.220012 -0.642704   
carheight                 0.261214   -0.108802 -0.320411 -0.048640   
curbweight                0.151362    0.750739 -0.266243 -0.757414   
enginesize                0.028971    0.809769 -0.244660 -0.653658   
boreratio                 0.005197    0.573677 -0.254976 -0.584532   
stroke                    0.186110    0.080940 -0.067964 -0.042145   
compressionratio          1.000000   -0.204326 -0.435741  0.324701   
horsepower               -0.204326    1.000000  0.131073 -0.801456   
peakrpm                  -0.435741    0.131073  1.000000 -0.113544   
citympg                   0.324701   -0.801456 -0.113544  1.000000   
highwaympg                0.265201   -0.770544 -0.054275  0.971337   
price                     0.067984    0.808139 -0.085267 -0.685751   

                  highwaympg     price  
symboling           0.034606 -0.079978  
wheelbase          -0.544082  0.577816  
carlength          -0.704662  0.682920  
carwidth           -0.677218  0.759325  
carheight          -0.107358  0.119336  
curbweight         -0.797465  0.835305  
enginesize         -0.677470  0.874145  
boreratio          -0.587012  0.553173  
stroke             -0.043931  0.079443  
compressionratio    0.265201  0.067984  
horsepower         -0.770544  0.808139  
peakrpm            -0.054275 -0.085267  
citympg             0.971337 -0.685751  
highwaympg          1.000000 -0.697599  
price              -0.697599  1.000000  

Encode Categorical Columns¶

In [34]:
categorical_columns = ['fueltype', 'aspiration', 'doornumber', 'carbody', 
                       'drivewheel', 'enginelocation', 'enginetype', 
                       'cylindernumber', 'fuelsystem']
for col in categorical_columns:
    car[col] = LabelEncoder().fit_transform(car[col])
car = pd.get_dummies(car, columns=['fueltype', 'aspiration', 'doornumber', 
                                   'carbody', 'drivewheel', 'enginelocation', 
                                   'enginetype', 'cylindernumber', 'fuelsystem'], drop_first=True)
In [35]:
correlation_matrix = car.corr()
print(correlation_matrix)
                  symboling  wheelbase  carlength  carwidth  carheight  \
symboling          1.000000  -0.531954  -0.357612 -0.232919  -0.541038   
wheelbase         -0.531954   1.000000   0.874587  0.795144   0.589435   
carlength         -0.357612   0.874587   1.000000  0.841118   0.491029   
carwidth          -0.232919   0.795144   0.841118  1.000000   0.279210   
carheight         -0.541038   0.589435   0.491029  0.279210   1.000000   
curbweight        -0.227691   0.776386   0.877728  0.867032   0.295572   
enginesize        -0.105790   0.569329   0.683360  0.735433   0.067149   
boreratio         -0.130051   0.488750   0.606454  0.559150   0.171071   
stroke            -0.008735   0.160959   0.129533  0.182942  -0.055307   
compressionratio  -0.178515   0.249786   0.158414  0.181129   0.261214   
horsepower         0.070873   0.353294   0.552623  0.640732  -0.108802   
peakrpm            0.273606  -0.360469  -0.287242 -0.220012  -0.320411   
citympg           -0.035823  -0.470414  -0.670909 -0.642704  -0.048640   
highwaympg         0.034606  -0.544082  -0.704662 -0.677218  -0.107358   
price             -0.079978   0.577816   0.682920  0.759325   0.119336   
fueltype_1         0.194311  -0.308346  -0.212679 -0.233880  -0.284631   
aspiration_1      -0.059866   0.257611   0.234539  0.300567   0.087311   
doornumber_1       0.664073  -0.447357  -0.398568 -0.207168  -0.552208   
carbody_1          0.168845  -0.008608   0.047292  0.066360  -0.072328   
carbody_2          0.435648  -0.386094  -0.436269 -0.222308  -0.477476   
carbody_3         -0.378341   0.291086   0.269647  0.154637   0.235863   
carbody_4         -0.298243   0.210899   0.219683  0.060639   0.459148   
drivewheel_1       0.102839  -0.460355  -0.508714 -0.472116  -0.100273   
drivewheel_2      -0.076381   0.498830   0.538370  0.511149   0.039814   
enginelocation_1   0.212471  -0.187790  -0.050989 -0.051698  -0.106234   
enginetype_1       0.009347  -0.004156   0.009391  0.209136  -0.092628   
enginetype_2      -0.133979   0.399603   0.261715  0.210771   0.319687   
enginetype_3      -0.082855  -0.204037  -0.274413 -0.286211   0.036260   
enginetype_4       0.037513  -0.183195  -0.118320 -0.124446  -0.046670   
enginetype_5      -0.013597   0.166152   0.244053  0.348869  -0.065063   
enginetype_6       0.245950  -0.081174  -0.057877 -0.013699  -0.238720   
cylindernumber_1  -0.090188   0.261182   0.259894  0.397690   0.152982   
cylindernumber_2  -0.034161  -0.309492  -0.400210 -0.523135   0.059696   
cylindernumber_3  -0.000238   0.145842   0.262981  0.209246  -0.049777   
cylindernumber_4   0.065707  -0.120709  -0.187445 -0.183473  -0.015076   
cylindernumber_5  -0.047012   0.037803   0.100413  0.153516  -0.170181   
cylindernumber_6   0.245950  -0.081174  -0.057877 -0.013699  -0.238720   
fuelsystem_1      -0.034069  -0.396505  -0.487237 -0.522594  -0.079418   
fuelsystem_2       0.212471  -0.070124  -0.049998 -0.011834  -0.206225   
fuelsystem_3      -0.194311   0.308346   0.212679  0.233880   0.284631   
fuelsystem_4       0.122067  -0.033294  -0.004831  0.012832  -0.101245   
fuelsystem_5       0.012532   0.348891   0.511374  0.461896   0.108685   
fuelsystem_6       0.181939  -0.117359  -0.079790 -0.046399  -0.278615   
fuelsystem_7       0.065707  -0.032129  -0.008245 -0.023158  -0.066778   

                  curbweight  enginesize  boreratio    stroke  \
symboling          -0.227691   -0.105790  -0.130051 -0.008735   
wheelbase           0.776386    0.569329   0.488750  0.160959   
carlength           0.877728    0.683360   0.606454  0.129533   
carwidth            0.867032    0.735433   0.559150  0.182942   
carheight           0.295572    0.067149   0.171071 -0.055307   
curbweight          1.000000    0.850594   0.648480  0.168790   
enginesize          0.850594    1.000000   0.583774  0.203129   
boreratio           0.648480    0.583774   1.000000 -0.055909   
stroke              0.168790    0.203129  -0.055909  1.000000   
compressionratio    0.151362    0.028971   0.005197  0.186110   
horsepower          0.750739    0.809769   0.573677  0.080940   
peakrpm            -0.266243   -0.244660  -0.254976 -0.067964   
citympg            -0.757414   -0.653658  -0.584532 -0.042145   
highwaympg         -0.797465   -0.677470  -0.587012 -0.043931   
price               0.835305    0.874145   0.553173  0.079443   
fueltype_1         -0.217275   -0.069594  -0.054451 -0.241829   
aspiration_1        0.324902    0.108217   0.212614  0.222982   
doornumber_1       -0.197379   -0.020742  -0.119258  0.011082   
carbody_1           0.098956    0.239363   0.208089  0.043215   
carbody_2          -0.287501   -0.216805  -0.227032  0.052316   
carbody_3           0.099425    0.088459   0.030517  0.035630   
carbody_4           0.164075   -0.027518   0.105719 -0.095084   
drivewheel_1       -0.666039   -0.518391  -0.583087  0.124397   
drivewheel_2        0.669987    0.565509   0.574105 -0.022325   
enginelocation_1    0.050468    0.196826   0.185042 -0.138455   
enginetype_1        0.109243    0.128248   0.158136 -0.032545   
enginetype_2        0.250124    0.016063   0.181729 -0.084688   
enginetype_3       -0.413293   -0.363334  -0.410383  0.366084   
enginetype_4       -0.080295   -0.016508   0.326798 -0.522808   
enginetype_5        0.400878    0.562403   0.119509 -0.044813   
enginetype_6       -0.039196   -0.184762   0.000127 -0.000187   
cylindernumber_1    0.264554    0.144878  -0.007797  0.176485   
cylindernumber_2   -0.576463   -0.631431  -0.164076 -0.111046   
cylindernumber_3    0.405490    0.511783   0.128365  0.068388   
cylindernumber_4   -0.143903   -0.111081  -0.108774 -0.050450   
cylindernumber_5    0.187964    0.335555   0.054482 -0.110878   
cylindernumber_6   -0.039196   -0.184762   0.000127 -0.000187   
fuelsystem_1       -0.577159   -0.442562  -0.353342 -0.234866   
fuelsystem_2       -0.040801   -0.166946   0.000110 -0.000162   
fuelsystem_3        0.217275    0.069594   0.054451  0.241829   
fuelsystem_4        0.034431    0.049033   0.070030  0.144263   
fuelsystem_5        0.520220    0.483520   0.419335 -0.110280   
fuelsystem_6       -0.002434    0.004490  -0.004213  0.251259   
fuelsystem_7        0.024052   -0.013327   0.025977 -0.005688   

                  compressionratio  ...  cylindernumber_4  cylindernumber_5  \
symboling                -0.178515  ...          0.065707         -0.047012   
wheelbase                 0.249786  ...         -0.120709          0.037803   
carlength                 0.158414  ...         -0.187445          0.100413   
carwidth                  0.181129  ...         -0.183473          0.153516   
carheight                 0.261214  ...         -0.015076         -0.170181   
curbweight                0.151362  ...         -0.143903          0.187964   
enginesize                0.028971  ...         -0.111081          0.335555   
boreratio                 0.005197  ...         -0.108774          0.054482   
stroke                    0.186110  ...         -0.050450         -0.110878   
compressionratio          1.000000  ...         -0.011354          0.023986   
horsepower               -0.204326  ...         -0.099600          0.280220   
peakrpm                  -0.435741  ...         -0.003697         -0.018411   
citympg                   0.324701  ...          0.233665         -0.131093   
highwaympg                0.265201  ...          0.226756         -0.140150   
price                     0.067984  ...         -0.071388          0.199634   
fueltype_1               -0.984356  ...          0.023020          0.023020   
aspiration_1              0.295541  ...         -0.032857         -0.032857   
doornumber_1             -0.177888  ...          0.079143          0.079143   
carbody_1                 0.029623  ...         -0.014109         -0.014109   
carbody_2                -0.202650  ...          0.097231         -0.050416   
carbody_3                 0.188286  ...         -0.065706          0.074604   
carbody_4                 0.016315  ...         -0.026093         -0.026093   
drivewheel_1             -0.062683  ...          0.058926         -0.083189   
drivewheel_2              0.105185  ...         -0.053740          0.091216   
enginelocation_1         -0.019762  ...         -0.008532         -0.008532   
enginetype_1             -0.002519  ...         -0.004902         -0.004902   
enginetype_2              0.219153  ...          0.280784         -0.017458   
enginetype_3              0.027545  ...         -0.112818         -0.112818   
enginetype_4             -0.084328  ...         -0.019672         -0.019672   
enginetype_5             -0.086649  ...         -0.018218          0.269069   
enginetype_6             -0.026436  ...         -0.009877         -0.009877   
cylindernumber_1          0.173360  ...         -0.016672         -0.016672   
cylindernumber_2         -0.012522  ...         -0.130168         -0.130168   
cylindernumber_3         -0.065559  ...         -0.025495         -0.025495   
cylindernumber_4         -0.011354  ...          1.000000         -0.004902   
cylindernumber_5          0.023986  ...         -0.004902          1.000000   
cylindernumber_6         -0.026436  ...         -0.009877         -0.009877   
fuelsystem_1             -0.183384  ...          0.101606         -0.048245   
fuelsystem_2             -0.022838  ...         -0.008532         -0.008532   
fuelsystem_3              0.984356  ...         -0.023020         -0.023020   
fuelsystem_4             -0.055528  ...         -0.004902         -0.004902   
fuelsystem_5             -0.311035  ...         -0.064430          0.076082   
fuelsystem_6             -0.153726  ...         -0.015003         -0.015003   
fuelsystem_7             -0.016654  ...         -0.004902         -0.004902   

                  cylindernumber_6  fuelsystem_1  fuelsystem_2  fuelsystem_3  \
symboling                 0.245950     -0.034069      0.212471     -0.194311   
wheelbase                -0.081174     -0.396505     -0.070124      0.308346   
carlength                -0.057877     -0.487237     -0.049998      0.212679   
carwidth                 -0.013699     -0.522594     -0.011834      0.233880   
carheight                -0.238720     -0.079418     -0.206225      0.284631   
curbweight               -0.039196     -0.577159     -0.040801      0.217275   
enginesize               -0.184762     -0.442562     -0.166946      0.069594   
boreratio                 0.000127     -0.353342      0.000110      0.054451   
stroke                   -0.000187     -0.234866     -0.000162      0.241829   
compressionratio         -0.026436     -0.183384     -0.022838      0.984356   
horsepower                0.019250     -0.541966     -0.009630     -0.163926   
peakrpm                   0.259380     -0.095625      0.224073     -0.476883   
citympg                  -0.183076      0.520751     -0.153487      0.255963   
highwaympg               -0.159173      0.528009     -0.137506      0.191392   
price                    -0.004544     -0.501374     -0.017306      0.105679   
fueltype_1                0.046383      0.226565      0.040070     -1.000000   
aspiration_1             -0.066203     -0.323378     -0.057191      0.401397   
doornumber_1              0.159463     -0.020525      0.137757     -0.191491   
carbody_1                -0.028428     -0.084946     -0.024558      0.018635   
carbody_2                 0.195907      0.120288      0.169240     -0.202093   
carbody_3                -0.132390     -0.060830     -0.114369      0.185623   
carbody_4                -0.052573      0.030349     -0.045417      0.028183   
drivewheel_1             -0.167615      0.410403     -0.144799     -0.090342   
drivewheel_2              0.183789     -0.464056      0.158772      0.122035   
enginelocation_1         -0.017192     -0.083975     -0.014851     -0.040070   
enginetype_1             -0.009877     -0.048245     -0.008532     -0.023020   
enginetype_2             -0.035176     -0.127347     -0.030388      0.268163   
enginetype_3             -0.227314      0.217909     -0.196371      0.020584   
enginetype_4             -0.039637      0.127119     -0.034242     -0.092384   
enginetype_5             -0.036707     -0.179302     -0.031711     -0.085556   
enginetype_6              1.000000     -0.097207      0.863879     -0.046383   
cylindernumber_1         -0.033591     -0.164082     -0.029019      0.213527   
cylindernumber_2         -0.262272      0.345607     -0.226571     -0.020184   
cylindernumber_3         -0.051369     -0.250917     -0.044376     -0.068594   
cylindernumber_4         -0.009877      0.101606     -0.008532     -0.023020   
cylindernumber_5         -0.009877     -0.048245     -0.008532     -0.023020   
cylindernumber_6          1.000000     -0.097207      0.863879     -0.046383   
fuelsystem_1             -0.097207      1.000000     -0.083975     -0.226565   
fuelsystem_2              0.863879     -0.083975      1.000000     -0.040070   
fuelsystem_3             -0.046383     -0.226565     -0.040070      1.000000   
fuelsystem_4             -0.009877     -0.048245     -0.008532     -0.023020   
fuelsystem_5             -0.059039     -0.634114     -0.112147     -0.302574   
fuelsystem_6             -0.030229     -0.147658     -0.026114     -0.070457   
fuelsystem_7             -0.009877     -0.048245     -0.008532     -0.023020   

                  fuelsystem_4  fuelsystem_5  fuelsystem_6  fuelsystem_7  
symboling             0.122067      0.012532      0.181939      0.065707  
wheelbase            -0.033294      0.348891     -0.117359     -0.032129  
carlength            -0.004831      0.511374     -0.079790     -0.008245  
carwidth              0.012832      0.461896     -0.046399     -0.023158  
carheight            -0.101245      0.108685     -0.278615     -0.066778  
curbweight            0.034431      0.520220     -0.002434      0.024052  
enginesize            0.049033      0.483520      0.004490     -0.013327  
boreratio             0.070030      0.419335     -0.004213      0.025977  
stroke                0.144263     -0.110280      0.251259     -0.005688  
compressionratio     -0.055528     -0.311035     -0.153726     -0.016654  
horsepower            0.072562      0.628372      0.117664     -0.025056  
peakrpm              -0.018411      0.149959      0.068748     -0.018411  
citympg              -0.066724     -0.644489     -0.123954     -0.013083  
highwaympg           -0.068807     -0.610813     -0.106615     -0.017848  
price                -0.002747      0.517075     -0.061475     -0.019580  
fueltype_1            0.023020      0.302574      0.070457      0.023020  
aspiration_1          0.149190     -0.050041      0.394703     -0.032857  
doornumber_1          0.079143     -0.025019      0.146272      0.079143  
carbody_1            -0.014109      0.117876     -0.043182     -0.014109  
carbody_2             0.097231     -0.208463      0.197165      0.097231  
carbody_3            -0.065706      0.078094     -0.105671     -0.065706  
carbody_4            -0.026093      0.016053     -0.079860     -0.026093  
drivewheel_1          0.058926     -0.437655      0.132020     -0.083189  
drivewheel_2         -0.053740      0.448977     -0.115182      0.091216  
enginelocation_1     -0.008532      0.132429     -0.026114     -0.008532  
enginetype_1         -0.004902      0.076082     -0.015003     -0.004902  
enginetype_2         -0.017458      0.020749     -0.053432     -0.017458  
enginetype_3          0.043450     -0.302922      0.132984      0.043450  
enginetype_4         -0.019672      0.004585     -0.060209     -0.019672  
enginetype_5         -0.018218      0.282760     -0.055759     -0.018218  
enginetype_6         -0.009877     -0.059039     -0.030229     -0.009877  
cylindernumber_1     -0.016672      0.084981     -0.051026     -0.016672  
cylindernumber_2      0.037659     -0.373291      0.115259      0.037659  
cylindernumber_3     -0.025495      0.365248     -0.078030     -0.025495  
cylindernumber_4     -0.004902     -0.064430     -0.015003     -0.004902  
cylindernumber_5     -0.004902      0.076082     -0.015003     -0.004902  
cylindernumber_6     -0.009877     -0.059039     -0.030229     -0.009877  
fuelsystem_1         -0.048245     -0.634114     -0.147658     -0.048245  
fuelsystem_2         -0.008532     -0.112147     -0.026114     -0.008532  
fuelsystem_3         -0.023020     -0.302574     -0.070457     -0.023020  
fuelsystem_4          1.000000     -0.064430     -0.015003     -0.004902  
fuelsystem_5         -0.064430      1.000000     -0.197195     -0.064430  
fuelsystem_6         -0.015003     -0.197195      1.000000     -0.015003  
fuelsystem_7         -0.004902     -0.064430     -0.015003      1.000000  

[44 rows x 44 columns]
In [36]:
plt.figure(figsize=(35,20))
correlation_matrix = car.corr()  # Calculate correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=1)
plt.title('Correlation Matrix')
plt.show()
No description has been provided for this image

Data Preprocessing¶

In [38]:
# Impute missing values (if any)
car.fillna(car.mean(), inplace=True)

Normalize / Scale the Data¶

In [40]:
scaler = StandardScaler()
car[['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize', 'horsepower', 'citympg', 'highwaympg']] = scaler.fit_transform(
    car[['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize', 'horsepower', 'citympg', 'highwaympg']]
)

Splitting the data ( Train-Test Split )¶

In [42]:
X = car.drop('price', axis=1)  # Features
y = car['price']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model 1¶

Linear Regression¶

In [45]:
# Initialize and train the model
linear_model = LinearRegression()

# Fit the model on the training data
linear_model.fit(X_train, y_train)

# Make predictions
y_train_pred = linear_model.predict(X_train)
y_test_pred = linear_model.predict(X_test)

-- Evaluation¶

In [47]:
# Calculate evaluation metrics
metrics = ['R²', 'MAE', 'MSE', 'RMSE']
train_scores = [r2_score(y_train, y_train_pred), mean_absolute_error(y_train, y_train_pred), 
                mean_squared_error(y_train, y_train_pred), np.sqrt(mean_squared_error(y_train, y_train_pred))]
test_scores = [r2_score(y_test, y_test_pred), mean_absolute_error(y_test, y_test_pred), 
               mean_squared_error(y_test, y_test_pred), np.sqrt(mean_squared_error(y_test, y_test_pred))]
# Create DataFrame for tabular display
results_df = pd.DataFrame({
    'Metric': metrics,
    'Train Score': train_scores,
    'Test Score': test_scores
})
print(results_df)
  Metric   Train Score    Test Score
0     R²  9.476416e-01  8.925567e-01
1    MAE  1.311951e+03  2.089383e+03
2    MSE  3.122540e+06  8.482008e+06
3   RMSE  1.767071e+03  2.912389e+03
In [48]:
# Plotting the evaluation metrics
fig, ax = plt.subplots(figsize=(8, 6))
x = np.arange(len(metrics))
ax.bar(x - 0.2, train_scores, 0.4, label='Train', color='blue')
ax.bar(x + 0.2, test_scores, 0.4, label='Test', color='orange')
ax.set_ylabel('Score')
ax.set_title('Evaluation Metrics for Linear Regression Model')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
plt.show()
No description has been provided for this image

Decision Tree Regression¶

In [50]:
# Initialize DecisionTreeRegressor model
dt_model = DecisionTreeRegressor()

# Train the model
dt_model.fit(X_train, y_train)

# Make predictions
y_train_pred = dt_model.predict(X_train)
y_test_pred = dt_model.predict(X_test)

-- Evaluation¶

In [52]:
# Calculate evaluation metrics
metrics = ['R²', 'MAE', 'MSE', 'RMSE']
train_scores = [r2_score(y_train, y_train_pred), mean_absolute_error(y_train, y_train_pred), 
                mean_squared_error(y_train, y_train_pred), np.sqrt(mean_squared_error(y_train, y_train_pred))]
test_scores = [r2_score(y_test, y_test_pred), mean_absolute_error(y_test, y_test_pred), 
               mean_squared_error(y_test, y_test_pred), np.sqrt(mean_squared_error(y_test, y_test_pred))]

# Create DataFrame for tabular display
results_df = pd.DataFrame({
    'Metric': metrics,
    'Train Score': train_scores,
    'Test Score': test_scores
})

# Display the results table
print(results_df)
  Metric   Train Score    Test Score
0     R²      0.998654  8.896985e-01
1    MAE     64.664634  1.912711e+03
2    MSE  80289.710366  8.707643e+06
3   RMSE    283.354390  2.950872e+03
In [53]:
# Plotting the evaluation metrics
fig, ax = plt.subplots(figsize=(8, 6))
x = np.arange(len(metrics))
ax.bar(x - 0.2, train_scores, 0.4, label='Train', color='blue')
ax.bar(x + 0.2, test_scores, 0.4, label='Test', color='orange')
ax.set_ylabel('Score')
ax.set_title('Evaluation Metrics for Decision Tree Regressor')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
plt.show()
No description has been provided for this image

Random Forest Regression¶

In [55]:
# Initialize RandomForestRegressor model
rf_model = RandomForestRegressor()

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

-- Evaluation¶

In [57]:
# Calculate evaluation metrics
train_scores_rf = [r2_score(y_train, y_train_pred_rf), 
                   mean_absolute_error(y_train, y_train_pred_rf), 
                   mean_squared_error(y_train, y_train_pred_rf), 
                   np.sqrt(mean_squared_error(y_train, y_train_pred_rf))]
test_scores_rf = [r2_score(y_test, y_test_pred_rf), 
                  mean_absolute_error(y_test, y_test_pred_rf), 
                  mean_squared_error(y_test, y_test_pred_rf), 
                  np.sqrt(mean_squared_error(y_test, y_test_pred_rf))]

# Create DataFrame for tabular display
results_rf_df = pd.DataFrame({
    'Metric': metrics,
    'Train Score': train_scores_rf,
    'Test Score': test_scores_rf
})

# Display the results table
print("Random Forest Regressor Results:")
print(results_rf_df)
Random Forest Regressor Results:
  Metric    Train Score    Test Score
0     R²       0.985170  9.592418e-01
1    MAE     593.412141  1.229665e+03
2    MSE  884399.423900  3.217618e+06
3   RMSE     940.425129  1.793772e+03
In [58]:
# Plotting the evaluation metrics
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x - 0.2, train_scores_rf, 0.4, label='Train', color='blue')
ax.bar(x + 0.2, test_scores_rf, 0.4, label='Test', color='orange')
ax.set_ylabel('Score')
ax.set_title('Evaluation Metrics for Random Forest Regressor')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
plt.show()
No description has been provided for this image

Gradient Boosting Regression¶

In [60]:
# Initialize GradientBoostingRegressor model
gb_model = GradientBoostingRegressor()

# Train the model
gb_model.fit(X_train, y_train)

# Make predictions
y_train_pred_gb = gb_model.predict(X_train)
y_test_pred_gb = gb_model.predict(X_test)

-- Evaluation¶

In [62]:
# Calculate evaluation metrics
train_scores_gb = [r2_score(y_train, y_train_pred_gb), 
                   mean_absolute_error(y_train, y_train_pred_gb), 
                   mean_squared_error(y_train, y_train_pred_gb), 
                   np.sqrt(mean_squared_error(y_train, y_train_pred_gb))]
test_scores_gb = [r2_score(y_test, y_test_pred_gb), 
                  mean_absolute_error(y_test, y_test_pred_gb), 
                  mean_squared_error(y_test, y_test_pred_gb), 
                  np.sqrt(mean_squared_error(y_test, y_test_pred_gb))]

# Create DataFrame for tabular display
results_gb_df = pd.DataFrame({
    'Metric': metrics,
    'Train Score': train_scores_gb,
    'Test Score': test_scores_gb
})

# Display the results table
print("Gradient Boosting Regressor Results:")
print(results_gb_df)
Gradient Boosting Regressor Results:
  Metric    Train Score    Test Score
0     R²       0.992674  9.245913e-01
1    MAE     488.730316  1.717112e+03
2    MSE  436929.562447  5.953066e+06
3   RMSE     661.006477  2.439891e+03
In [63]:
# Plotting the evaluation metrics
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x - 0.2, train_scores_gb, 0.4, label='Train', color='blue')
ax.bar(x + 0.2, test_scores_gb, 0.4, label='Test', color='orange')
ax.set_ylabel('Score')
ax.set_title('Evaluation Metrics for Gradient Boosting Regressor')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
plt.show()
No description has been provided for this image

Hyperparameter Tuning¶

In [65]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.8824698567313881

Model Comparison and Final Selection¶

In [67]:
# Plot Actual vs Predicted Prices for Random Forest
plt.scatter(y_test, y_test_pred_rf)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Random Forest - Actual vs Predicted Prices")
plt.show()
No description has been provided for this image
In [68]:
# Plot Residuals for Random Forest
residuals = y_test - y_test_pred_rf  # Calculate residuals
plt.scatter(y_test_pred_rf, residuals)  # Scatter plot of Predicted Prices vs Residuals
plt.axhline(0, color='red', linestyle='--')  # Add a horizontal line at 0 for reference
plt.xlabel("Predicted Prices")
plt.ylabel("Residuals")
plt.title("Random Forest - Residuals")
plt.show()
No description has been provided for this image

Cross-Validation¶

In [70]:
# Perform cross-validation
cv_scores = cross_val_score(RandomForestRegressor(), X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print("Cross-validation scores:", cv_scores)
Cross-validation scores: [-4843821.62845046 -2713120.09150086 -9877540.39946481 -3432621.10299219
 -9994351.9494586 ]

Feature Importance¶

In [72]:
# Get feature importance
feature_importance = rf_model.feature_importances_
feature_names = X_train.columns

# Create a DataFrame for visualization
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 10))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title("Feature Importance - Random Forest")
plt.show()
No description has been provided for this image

Model Evaluation on Test Data¶

In [74]:
# Final Evaluation
print(f"R²: {r2_score(y_test, y_test_pred_rf)}")
print(f"MAE: {mean_absolute_error(y_test, y_test_pred_rf)}")
print(f"MSE: {mean_squared_error(y_test, y_test_pred_rf)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred_rf))}")
R²: 0.9592417789384023
MAE: 1229.6652154471544
MSE: 3217617.854504465
RMSE: 1793.771962793617

Comparison Decision Tree, Random Forest, and Gradient Boosting¶

In [76]:
# Define the models' predictions
models = {
    "Decision Tree": {'train_pred': y_train_pred, 'test_pred': y_test_pred},
    "Random Forest": {'train_pred': y_train_pred_rf, 'test_pred': y_test_pred_rf},
    "Gradient Boosting": {'train_pred': y_train_pred_gb, 'test_pred': y_test_pred_gb}
}
# Initialize metrics
metrics = ['R²', 'MAE', 'MSE', 'RMSE']
results = []
# Calculate evaluation metrics for each model
for model_name, predictions in models.items():
    # Get train and test predictions
    y_train_pred = predictions['train_pred']
    y_test_pred = predictions['test_pred'] 
    # Calculate evaluation metrics for training and test data
    train_scores = [r2_score(y_train, y_train_pred), 
                    mean_absolute_error(y_train, y_train_pred), 
                    mean_squared_error(y_train, y_train_pred), 
                    np.sqrt(mean_squared_error(y_train, y_train_pred))]
    test_scores = [r2_score(y_test, y_test_pred), 
                   mean_absolute_error(y_test, y_test_pred), 
                   mean_squared_error(y_test, y_test_pred), 
                   np.sqrt(mean_squared_error(y_test, y_test_pred))]
    # Append results to the list
    results.append([model_name] + train_scores + test_scores)
# Create DataFrame for tabular display
results_df = pd.DataFrame(results, columns=['Model', 'Train R²', 'Train MAE', 'Train MSE', 'Train RMSE',
                                            'Test R²', 'Test MAE', 'Test MSE', 'Test RMSE'])
# Display the results table
print(results_df)
# Plotting the evaluation metrics
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(metrics))
width = 0.2  # Bar width for spacing
for idx, model_name in enumerate(models.keys()):
    # Train scores
    ax.bar(x - width + idx * width, results[idx][1:5], width, label=f'{model_name} Train', alpha=0.7)
    # Test scores
    ax.bar(x - width + idx * width + width, results[idx][5:], width, label=f'{model_name} Test', alpha=0.7)
ax.set_ylabel('Score')
ax.set_title('Evaluation Metrics for Regression Models')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
plt.tight_layout()
plt.show()
               Model  Train R²   Train MAE      Train MSE  Train RMSE  \
0      Decision Tree  0.998654   64.664634   80289.710366  283.354390   
1      Random Forest  0.985170  593.412141  884399.423900  940.425129   
2  Gradient Boosting  0.992674  488.730316  436929.562447  661.006477   

    Test R²     Test MAE      Test MSE    Test RMSE  
0  0.889699  1912.711390  8.707643e+06  2950.871598  
1  0.959242  1229.665215  3.217618e+06  1793.771963  
2  0.924591  1717.112402  5.953066e+06  2439.890577  
No description has been provided for this image

MLP Classifier¶

In [78]:
# Initialize MLPRegressor instead of MLPClassifier
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500)

# Train the model
mlp_model.fit(X_train, y_train)

# Make predictions
y_train_pred_mlp = mlp_model.predict(X_train)
y_test_pred_mlp = mlp_model.predict(X_test)

# Calculate regression metrics
print("MLP Train MAE:", mean_absolute_error(y_train, y_train_pred_mlp))
print("MLP Test MAE:", mean_absolute_error(y_test, y_test_pred_mlp))

print("MLP Train MSE:", mean_squared_error(y_train, y_train_pred_mlp))
print("MLP Test MSE:", mean_squared_error(y_test, y_test_pred_mlp))

print("MLP Train RMSE:", mean_squared_error(y_train, y_train_pred_mlp, squared=False))
print("MLP Test RMSE:", mean_squared_error(y_test, y_test_pred_mlp, squared=False))

print("MLP Train R²:", r2_score(y_train, y_train_pred_mlp))
print("MLP Test R²:", r2_score(y_test, y_test_pred_mlp))
MLP Train MAE: 5800.954912767774
MLP Test MAE: 6210.298123928328
MLP Train MSE: 61711063.26465425
MLP Test MSE: 81912313.64306048
MLP Train RMSE: 7855.638946938323
MLP Test RMSE: 9050.542173983858
MLP Train R²: -0.034763599243714305
MLP Test R²: -0.03759996932418308

Naive Bayes¶

In [80]:
# Initialize LabelEncoder
le = LabelEncoder()

# Combine y_train and y_test to fit the LabelEncoder on all possible labels
y_combined = np.concatenate([y_train, y_test])

# Fit the encoder on the combined data
le.fit(y_combined)

# Transform both y_train and y_test
y_train_categorical = le.transform(y_train)
y_test_categorical = le.transform(y_test)

# Initialize Naive Bayes model
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train_categorical)

# Make predictions
y_train_pred_nb = nb_model.predict(X_train)
y_test_pred_nb = nb_model.predict(X_test)

# Calculate evaluation metrics
print("Naive Bayes Train Accuracy:", accuracy_score(y_train_categorical, y_train_pred_nb))
print("Naive Bayes Test Accuracy:", accuracy_score(y_test_categorical, y_test_pred_nb))

# Classification report
print("Classification Report for Naive Bayes:")
print(classification_report(y_test_categorical, y_test_pred_nb))
Naive Bayes Train Accuracy: 0.9695121951219512
Naive Bayes Test Accuracy: 0.04878048780487805
Classification Report for Naive Bayes:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
          10       1.00      1.00      1.00         1
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         1
          26       0.00      0.00      0.00         0
          28       0.00      0.00      0.00         0
          36       0.00      0.00      0.00         0
          38       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         0
          42       0.00      0.00      0.00         1
          44       0.00      0.00      0.00         1
          45       0.00      0.00      0.00         1
          46       0.00      0.00      0.00         1
          48       0.00      0.00      0.00         1
          49       0.00      0.00      0.00         0
          51       0.00      0.00      0.00         0
          52       0.00      0.00      0.00         1
          54       0.00      0.00      0.00         0
          55       0.00      0.00      0.00         1
          57       0.00      0.00      0.00         1
          58       0.00      0.00      0.00         1
          60       0.00      0.00      0.00         0
          61       0.00      0.00      0.00         1
          65       0.00      0.00      0.00         0
          66       0.00      0.00      0.00         1
          72       0.00      0.00      0.00         1
          78       0.00      0.00      0.00         1
          79       0.00      0.00      0.00         1
          82       0.00      0.00      0.00         1
          84       0.00      0.00      0.00         1
          87       0.00      0.00      0.00         1
          88       0.00      0.00      0.00         0
          95       0.00      0.00      0.00         1
          99       0.00      0.00      0.00         0
         100       0.00      0.00      0.00         1
         104       0.00      0.00      0.00         0
         105       0.00      0.00      0.00         1
         110       0.00      0.00      0.00         1
         112       0.00      0.00      0.00         1
         116       0.00      0.00      0.00         1
         117       0.00      0.00      0.00         1
         119       1.00      1.00      1.00         1
         125       0.00      0.00      0.00         1
         126       0.00      0.00      0.00         0
         134       0.00      0.00      0.00         0
         139       0.00      0.00      0.00         0
         151       0.00      0.00      0.00         1
         153       0.00      0.00      0.00         0
         154       0.00      0.00      0.00         1
         165       0.00      0.00      0.00         1
         171       0.00      0.00      0.00         1
         172       0.00      0.00      0.00         1
         173       0.00      0.00      0.00         1
         174       0.00      0.00      0.00         1
         176       0.00      0.00      0.00         0
         186       0.00      0.00      0.00         1
         187       0.00      0.00      0.00         1

    accuracy                           0.05        41
   macro avg       0.03      0.03      0.03        41
weighted avg       0.05      0.05      0.05        41

XGBoost¶

In [82]:
# Initialize XGBoost Regressor model
xg_model = xgb.XGBRegressor()

# Train the model
xg_model.fit(X_train, y_train)

# Make predictions
y_train_pred_xg = xg_model.predict(X_train)
y_test_pred_xg = xg_model.predict(X_test)

# Calculate evaluation metrics
print("XGBoost Train R2:", r2_score(y_train, y_train_pred_xg))
print("XGBoost Test R2:", r2_score(y_test, y_test_pred_xg))

# Calculate Mean Squared Error
print("XGBoost Train MSE:", mean_squared_error(y_train, y_train_pred_xg))
print("XGBoost Test MSE:", mean_squared_error(y_test, y_test_pred_xg))
XGBoost Train R2: 0.9986532746417341
XGBoost Test R2: 0.9341680985913954
XGBoost Train MSE: 80315.78791987168
XGBoost Test MSE: 5197034.999348447

SCR¶

In [84]:
# Initialize the SVR model
svr_model = SVR()

# Train the model
svr_model.fit(X_train, y_train)

# Make predictions
y_train_pred_svr = svr_model.predict(X_train)
y_test_pred_svr = svr_model.predict(X_test)

# Evaluate performance
print("SVR Train MSE:", mean_squared_error(y_train, y_train_pred_svr))
print("SVR Test MSE:", mean_squared_error(y_test, y_test_pred_svr))
print("SVR Train R2:", r2_score(y_train, y_train_pred_svr))
print("SVR Test R2:", r2_score(y_test, y_test_pred_svr))
SVR Train MSE: 66279908.96687816
SVR Test MSE: 87029871.44049665
SVR Train R2: -0.11137344799882753
SVR Test R2: -0.10242511681999389

Model Comparison and Evaluation¶

In [147]:
# Create a dictionary of the models and their corresponding predictions
models = {
    'SVR': y_test_pred_svr,
    'MLP': y_test_pred_mlp,
    'Naive Bayes': y_test_pred_nb,
    'XGBoost': y_test_pred_xg
}

# Store the evaluation metrics for regression
results = []

for model_name, y_pred in models.items():
    # Calculate mean squared error and R^2 score
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Append the results
    results.append({
        'Model': model_name,
        'Mean Squared Error': mse,
        'R-squared': r2
    })

# Convert the results into a DataFrame for easier visualization
results_df = pd.DataFrame(results)

# Display the results
print(results_df)
         Model  Mean Squared Error  R-squared
0          SVR        8.702987e+07  -0.102425
1          MLP        8.191231e+07  -0.037600
2  Naive Bayes        2.578659e+08  -2.266440
3      XGBoost        5.197035e+06   0.934168
In [ ]: